# NOTE(review): rm(list = ls()) only clears the global environment — it does
# not detach packages, reset options(), or restore par(). Kept for
# compatibility with the original workflow, but restarting R is preferred.
rm(list = ls())

# Load libraries ----
library(tm)          # DocumentTermMatrix, weightTfIdf(), inspect()
library(proxy)       # dist(..., method = "cosine")
library(RTextTools)
library(fpc)
library(wordcloud)   # wordcloud(), comparison.cloud(); attaches RColorBrewer
library(cluster)     # clusplot()  (was loaded twice; duplicate removed)
library(stringi)
library(dplyr)
library(ggplot2)
library(tidyr)
library(tidytext)    # get_sentiments(), cast_dtm()
library(data.table)
library(radarchart)  # chartJSRadar()
library(repr)
# Data pre-processing ----
# Each UCI bag-of-words dataset ships as a vocabulary file (one word per
# line) and a docword file of "docID wordID count" triples after a
# three-line header.

# Read a vocabulary file and number its rows. Using row_number() instead of
# a hard-coded vocabulary size (originally 1:28102 / 1:12419 / 1:6906) makes
# the loader robust to vocab files of any length.
read_vocab <- function(path) {
  read.csv(path, header = FALSE, col.names = c("word"),
           stringsAsFactors = FALSE) %>%
    mutate(wordID = row_number())
}

# Read a docword triples file, skipping the 3-line header.
read_docword <- function(path) {
  read.csv(path, header = FALSE, sep = " ",
           col.names = c("docID", "wordID", "count"), skip = 3)
}

# Join counts to the vocabulary, yielding tidy (docID, word, count) rows.
join_words <- function(docword, vocab) {
  merge(docword, vocab, by = "wordID") %>%
    select(docID, word, count)
}

# Enron emails
enron_vocab <- read_vocab("dataset/vocab.enron.txt")
enron_words <- join_words(read_docword("dataset/docword.enron.txt"), enron_vocab)
# NIPS papers
nips_vocab <- read_vocab("dataset/vocab.nips.txt")
nips_words <- join_words(read_docword("dataset/docword.nips.txt"), nips_vocab)
# KOS blog posts
kos_vocab <- read_vocab("dataset/vocab.kos.txt")
kos_words <- join_words(read_docword("dataset/docword.kos.txt"), kos_vocab)
# Exploration ----
# Corpus-wide frequency of every word, then bar-chart the ten most frequent.
enron_unique_words <- aggregate(enron_words['count'], by = enron_words['word'], sum)
nips_unique_words <- aggregate(nips_words['count'], by = nips_words['word'], sum)
kos_unique_words <- aggregate(kos_words['count'], by = kos_words['word'], sum)

enron_top <- enron_unique_words[order(enron_unique_words$count, decreasing = TRUE)[1:10], ]
nips_top <- nips_unique_words[order(nips_unique_words$count, decreasing = TRUE)[1:10], ]
kos_top <- kos_unique_words[order(kos_unique_words$count, decreasing = TRUE)[1:10], ]

# Map bare column names inside aes() — referencing enron_top$word (as the
# original did) bypasses the data argument and is a documented ggplot2
# anti-pattern. geom_col() is the idiomatic geom_bar(stat = "identity").
g <- ggplot(data = enron_top, aes(x = word, y = count))
g + geom_col() + xlab("Words") + ylab("Count") + ggtitle("Top 10 words")

g <- ggplot(data = nips_top, aes(x = word, y = count))
g + geom_col() + xlab("Words") + ylab("Count") + ggtitle("Top 10 words")

g <- ggplot(data = kos_top, aes(x = word, y = count))
g + geom_col() + xlab("Words") + ylab("Count") + ggtitle("Top 10 words")

# Sentiment Analysis ----
# NOTE(review): recent tidytext releases name the AFINN score column "value",
# not "score"; this code assumes the older "score" column — confirm against
# the installed tidytext version before running.
afinn <- get_sentiments("afinn")
nrc <- get_sentiments("nrc")
bing <- get_sentiments("bing")

# Enron sentiment based on the AFINN lexicon.
enron_afinn <- enron_words %>%
  # Keep only words present in the AFINN lexicon
  inner_join(afinn, by = c("word" = "word")) %>%
  # n = total occurrences of each score within each document, weighted by
  # the per-document word count. (The original count(score, docID) counted
  # distinct words only, ignoring how often each word appeared.)
  count(score, docID, wt = count)

enron_afinn_agg <- enron_afinn %>%
  # One row per document
  group_by(docID) %>%
  # Per-document sentiment: each score weighted by its occurrence count
  summarize(total_score = sum(score * n))

# Smoothed sentiment trend across document IDs
ggplot(enron_afinn_agg, aes(docID, total_score)) +
  geom_smooth()

# Enron sentiment based on the NRC lexicon: total word occurrences
# attributed to each NRC sentiment category.
enron_nrc <- enron_words %>%
  # Keep only words present in the NRC lexicon
  inner_join(nrc, by = c("word" = "word")) %>%
  # Occurrence total per sentiment category
  group_by(sentiment) %>%
  summarize(total_count = sum(count))

# Bar chart of occurrence totals per sentiment category
ggplot(enron_nrc, aes(x = sentiment, y = total_count)) +
  geom_col()

# Enron sentiment via the bing lexicon: net polarity per document.
enron_bing <- enron_words %>%
  # Keep only words present in the bing lexicon
  inner_join(bing, by = c("word" = "word")) %>%
  # Rows per (sentiment, docID) pair
  count(sentiment, docID) %>%
  # One row per document with positive / negative count columns
  spread(sentiment, n, fill = 0) %>%
  # Net polarity per document. (The original also re-assigned
  # docID = unique(docID) inside mutate(), which is a no-op when the docIDs
  # are already unique — guaranteed after spread() — and a length error
  # otherwise; removed.)
  mutate(polarity = positive - negative)

# Smoothed polarity trend across document IDs
ggplot(enron_bing, aes(docID, polarity)) +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  ggtitle("Enron Emails Chronological Polarity")

# Enron frequency analysis: per-word polarity from the bing lexicon.
enron_sents <- inner_join(enron_words, bing, by = c("word" = "word"))

# Occurrence-weighted counts per (word, sentiment), spread to one row per
# word with positive / negative columns, then net polarity.
enron_tidy_sentiment <- enron_sents %>%
  count(word, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(polarity = positive - negative)

# Restrict to strongly polarized words
enron_tidy_small <- enron_tidy_sentiment %>%
  filter(abs(polarity) >= 1000)

# Label each word by the sign of its polarity (for the fill aesthetic)
enron_tidy_pol <- enron_tidy_small %>%
  mutate(pol = ifelse(polarity > 0, "positive", "negative"))

# Diverging bar chart, words ordered by polarity
ggplot(
  enron_tidy_pol,
  aes(reorder(word, polarity), polarity, fill = pol)) +
  geom_bar(stat = "identity") +
  ggtitle("Enron Emails: Sentiment Word Frequency") +
  theme(axis.text.x = element_text(angle = 90, vjust = -0.1))

# Enron emotional introspection: comparison cloud across NRC emotions.
# An explicit join key silences the "Joining, by = ..." message and guards
# against accidental multi-column joins.
enron_sentiment <- inner_join(enron_words, nrc, by = c("word" = "word"))
# Drop the plain positive/negative categories, keeping the eight emotions
enron_pos_neg <- enron_sentiment %>%
  filter(!grepl("positive|negative", sentiment))
# Term-by-sentiment count matrix (rows = terms, columns = sentiments).
# NOTE(review): count() here tallies (word, doc) rows, not occurrence totals;
# add wt = count if occurrence-weighted frequencies were intended.
enron_tidy <- enron_pos_neg %>%
  count(sentiment, term = word) %>%
  spread(sentiment, n, fill = 0) %>%
  as.data.frame()
# comparison.cloud() reads terms from row names, so move the first column
# there and drop it
rownames(enron_tidy) <- enron_tidy[, 1]
enron_tidy[, 1] <- NULL
comparison.cloud(enron_tidy, max.words = 200, title.size = 1.5)

# Enron radar chart of NRC emotion totals.
# Explicit join key (the original relied on the implicit natural join).
enron_sentiment <- inner_join(enron_words, nrc, by = c("word" = "word"))
# Drop the plain positive/negative categories, keeping the eight emotions
enron_pos_neg <- enron_sentiment %>%
  filter(!grepl("positive|negative", sentiment))
# Rows (word-document entries) per emotion; tally() counts rows and does
# not weight by the per-document word count.
enron_tally <- enron_pos_neg %>%
  group_by(sentiment) %>%
  tally()
# Interactive JavaScript radar chart (first column supplies the axis labels)
chartJSRadar(enron_tally)
# Document-term matrices ----
# cast_dtm() converts tidy (document, term, count) triples into a tm
# DocumentTermMatrix with term-frequency weighting.
enron_dtm <- cast_dtm(enron_words, docID, word, count)
nips_dtm <- cast_dtm(nips_words, docID, word, count)
kos_dtm <- cast_dtm(kos_words, docID, word, count)
# Structural summary of each raw term-frequency DTM. The "## ..." lines
# below are console output pasted in by the author for reference.
inspect(enron_dtm)
## <<DocumentTermMatrix (documents: 39861, terms: 28099)>>
## Non-/sparse entries: 3710420/1116343819
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs business california company customer electricity energy market
## 23414 72 3 49 33 0 1 20
## 25892 5 1 12 1 0 2 7
## 28717 3 43 14 16 35 45 11
## 31297 25 44 10 17 25 51 13
## 33802 19 80 26 40 49 25 17
## 33940 5 1 12 1 0 2 7
## 33959 55 11 80 35 0 1 25
## 33971 72 3 49 33 0 1 20
## 34007 25 44 10 17 25 51 13
## 34027 3 43 14 16 35 45 11
## Terms
## Docs meeting plan power
## 23414 1 20 0
## 25892 13 13 6
## 28717 7 24 59
## 31297 3 38 55
## 33802 4 22 116
## 33940 13 13 6
## 33959 6 20 1
## 33971 1 20 0
## 34007 3 38 55
## 34027 7 24 59
# NIPS DTM summary (pasted output below)
inspect(nips_dtm)
## <<DocumentTermMatrix (documents: 1500, terms: 12375)>>
## Non-/sparse entries: 746316/17816184
## Sparsity : 96%
## Maximal term length: 23
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs algorithm data function input learning model network neural set
## 1090 0 2 3 22 4 28 8 9 6
## 1461 2 29 3 1 22 36 1 0 4
## 1471 5 10 7 15 34 6 17 7 33
## 178 0 0 18 3 13 6 70 38 4
## 4 17 28 35 21 26 11 16 10 10
## 512 5 7 2 16 4 33 7 1 16
## 56 1 5 15 6 4 5 32 21 21
## 682 5 1 4 30 18 3 58 9 5
## 746 1 1 1 49 3 4 37 9 8
## 919 0 2 2 24 26 17 42 21 2
## Terms
## Docs system
## 1090 13
## 1461 15
## 1471 8
## 178 6
## 4 21
## 512 48
## 56 22
## 682 2
## 746 5
## 919 15
# KOS DTM summary (pasted output below)
inspect(kos_dtm)
## <<DocumentTermMatrix (documents: 3430, terms: 6906)>>
## Non-/sparse entries: 353160/23334420
## Sparsity : 99%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs bush campaign democratic general house iraq kerry november poll war
## 1037 1 0 1 18 2 4 2 11 2 2
## 1197 25 7 3 3 1 3 8 2 1 0
## 1304 3 0 2 6 3 10 2 11 2 4
## 1739 12 2 4 0 1 1 1 1 1 0
## 1832 17 0 0 0 7 29 0 0 0 2
## 1955 2 6 3 1 2 0 4 10 5 1
## 232 3 4 1 0 5 1 1 0 11 1
## 2355 1 6 14 1 1 0 0 1 0 0
## 3411 5 4 3 1 2 0 5 10 4 1
## 480 1 0 1 2 1 2 6 0 0 2
# TF-IDF weighting ----
# Re-weight each DTM by normalized term frequency-inverse document frequency.
enron_dtm_tfidf <- enron_dtm %>% weightTfIdf(normalize = TRUE)
nips_dtm_tfidf <- nips_dtm %>% weightTfIdf(normalize = TRUE)
kos_dtm_tfidf <- kos_dtm %>% weightTfIdf(normalize = TRUE)
# Structural summaries of the TF-IDF-weighted DTMs. The "## ..." lines
# below are console output pasted in by the author for reference.
inspect(enron_dtm_tfidf)
## <<DocumentTermMatrix (documents: 39861, terms: 28099)>>
## Non-/sparse entries: 3710420/1116343819
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs attached date market meeting number office report
## 1487 0.01350825 0 0.00000000 0 0 0 0.07622168
## 19095 0.00000000 0 0.00000000 0 0 0 0.00000000
## 19314 0.00000000 0 0.00000000 0 0 0 0.00000000
## 19374 0.00000000 0 0.00000000 0 0 0 0.00000000
## 21785 0.00000000 0 0.00351855 0 0 0 0.00000000
## 21922 0.00000000 0 0.00000000 0 0 0 0.00000000
## 27395 0.00000000 0 0.00000000 0 0 0 0.00000000
## 5686 0.00000000 0 0.00000000 0 0 0 0.00000000
## 5750 0.00000000 0 0.00000000 0 0 0 0.00000000
## 6492 0.00000000 0 0.00000000 0 0 0 0.00000000
## Terms
## Docs request team think
## 1487 0 0.00000000 0.02748471
## 19095 0 0.00000000 0.00000000
## 19314 0 0.00000000 0.00000000
## 19374 0 0.00000000 0.00000000
## 21785 0 0.00000000 0.00000000
## 21922 0 0.00000000 0.00000000
## 27395 0 0.00000000 0.00000000
## 5686 0 0.00000000 0.00000000
## 5750 0 0.01331161 0.00000000
## 6492 0 0.00000000 0.00000000
# NIPS TF-IDF DTM summary (pasted output below)
inspect(nips_dtm_tfidf)
## <<DocumentTermMatrix (documents: 1500, terms: 12375)>>
## Non-/sparse entries: 746316/17816184
## Sparsity : 96%
## Maximal term length: 23
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs cell circuit image learning neuron object speech training
## 1088 0 0 0 0.000000000 0.00000000 0 0 0
## 1227 0 0 0 0.000000000 0.00000000 0 0 0
## 1365 0 0 0 0.000000000 0.00000000 0 0 0
## 1500 0 0 0 0.000000000 0.00000000 0 0 0
## 553 0 0 0 0.000000000 0.00000000 0 0 0
## 697 0 0 0 0.000000000 0.00000000 0 0 0
## 819 0 0 0 0.000000000 0.00000000 0 0 0
## 83 0 0 0 0.005527718 0.04293003 0 0 0
## 86 0 0 0 0.000000000 0.00000000 0 0 0
## 954 0 0 0 0.000000000 0.00000000 0 0 0
## Terms
## Docs unit visual
## 1088 0 0
## 1227 0 0
## 1365 0 0
## 1500 0 0
## 553 0 0
## 697 0 0
## 819 0 0
## 83 0 0
## 86 0 0
## 954 0 0
# KOS TF-IDF DTM summary (pasted output below)
inspect(kos_dtm_tfidf)
## <<DocumentTermMatrix (documents: 3430, terms: 6906)>>
## Non-/sparse entries: 353160/23334420
## Sparsity : 99%
## Maximal term length: 40
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs bush clark dean edwards iraq kerry november percent poll
## 1639 0.00000000 0 0 0.00000000 0 0 0 0 0
## 1685 0.00000000 0 0 0.00000000 0 0 0 0 0
## 195 0.00000000 0 0 0.03764289 0 0 0 0 0
## 196 0.00000000 0 0 0.06984344 0 0 0 0 0
## 2098 0.00000000 0 0 0.00000000 0 0 0 0 0
## 2649 0.00000000 0 0 0.00000000 0 0 0 0 0
## 2757 0.00000000 0 0 0.00000000 0 0 0 0 0
## 328 0.00607109 0 0 0.00000000 0 0 0 0 0
## 382 0.00000000 0 0 0.00000000 0 0 0 0 0
## 838 0.00000000 0 0 0.00000000 0 0 0 0 0
## Terms
## Docs senate
## 1639 0.000000000
## 1685 0.011717515
## 195 0.000000000
## 196 0.000000000
## 2098 0.000000000
## 2649 0.000000000
## 2757 0.000000000
## 328 0.000000000
## 382 0.000000000
## 838 0.008218167
# Word clouds of TF-IDF weights for one sample document (doc #3) ----
# Only the row being plotted is densified. The original converted 1000
# DTM rows to a dense matrix (~225 MB for Enron) just to index row 3;
# densifying the single row produces the identical cloud.
dtm.matrix <- as.matrix(enron_dtm_tfidf[3, ])
set.seed(142)  # reproducible word placement
wordcloud(colnames(dtm.matrix), dtm.matrix[1, ], max.words = 200,
          rot.per = 0.2, colors = brewer.pal(6, "Dark2"))

dtm.matrix <- as.matrix(nips_dtm_tfidf[3, ])
set.seed(142)
wordcloud(colnames(dtm.matrix), dtm.matrix[1, ], max.words = 200,
          rot.per = 0.2, colors = brewer.pal(6, "Dark2"))

dtm.matrix <- as.matrix(kos_dtm_tfidf[3, ])
set.seed(142)
wordcloud(colnames(dtm.matrix), dtm.matrix[1, ], max.words = 200,
          rot.per = 0.2, colors = brewer.pal(6, "Dark2"))

# Distance matrices using the cosine measure ----
# proxy:: is named explicitly: method = "cosine" is supplied by proxy's
# dist(), which masks stats::dist() once the package is attached.
em <- as.matrix(enron_dtm_tfidf[1:1000, ])
enron_distMatrix <- proxy::dist(em, method = "cosine")
nm <- as.matrix(nips_dtm_tfidf[1:20, ])
nips_distMatrix <- proxy::dist(nm, method = "cosine")
km <- as.matrix(kos_dtm_tfidf[1:20, ])
kos_distMatrix <- proxy::dist(km, method = "cosine")
# Smaller Enron subset for the dendrogram / k-means demos below
m2 <- as.matrix(enron_dtm_tfidf[1:20, ])
distMatrix2 <- proxy::dist(m2, method = "cosine")
# Clustering ----
# Hierarchical clustering (Ward linkage on cosine distances).
# hang = -1 aligns all leaf labels at the dendrogram baseline;
# rect.hclust() outlines the chosen number of clusters.
groups <- hclust(distMatrix2, method = "ward.D")
plot(groups, cex = 0.9, hang = -1)
rect.hclust(groups, k = 7)   # Enron: 7 clusters

groups <- hclust(nips_distMatrix, method = "ward.D")
plot(groups, cex = 0.9, hang = -1)
rect.hclust(groups, k = 3)   # NIPS: 3 clusters

groups <- hclust(kos_distMatrix, method = "ward.D")
plot(groups, cex = 0.9, hang = -1)
rect.hclust(groups, k = 4)   # KOS: 4 clusters

# K-means clustering ----
# kmeans() coerces the dist object to a full square matrix, so each document
# is represented by its vector of distances to every other document.
# TRUE/FALSE spelled out (T/F are reassignable aliases, not reserved words).
kfit <- kmeans(distMatrix2, 7, nstart = 100)
clusplot(as.matrix(distMatrix2), kfit$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

kfit <- kmeans(nips_distMatrix, 3, nstart = 100)
clusplot(as.matrix(nips_distMatrix), kfit$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

kfit <- kmeans(kos_distMatrix, 4, nstart = 100)
clusplot(as.matrix(kos_distMatrix), kfit$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

# K-means elbow plot ----
# Look for the "elbow" in summed intra-cluster distance (withinss) as a
# function of k. Preallocate numerically: the original `wss <- 2:30`
# created a length-29 vector whose first slot held a stale value (2) that
# the subsequent plot silently skipped.
wss <- numeric(30)
for (k in 2:30) {
  wss[k] <- sum(kmeans(enron_distMatrix, centers = k, nstart = 25)$withinss)
}
plot(2:30, wss[2:30], type = "b",
     xlab = "Number of Clusters", ylab = "Within groups sum of squares")
